{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import math\n",
    "import copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(178, 14)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('normalizedwinedata.csv', header=None)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = df.iloc[:, 0]\n",
    "axis = df.iloc[:, 1 : ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(178, 13)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "axis.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def initialCentroid(x, k):\n",
    "    \"\"\"Choose initial cluster centres with k-means++ style seeding.\n",
    "\n",
    "    The first centre is drawn uniformly at random. Each further centre is drawn with\n",
    "    probability proportional to the squared Euclidean distance from a point to its\n",
    "    nearest already-chosen centre.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : np.ndarray\n",
    "        Point coordinates, one row per point.\n",
    "    k : int\n",
    "        Number of centres to choose (at least one index is always returned).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        Row indices into `x` of the chosen centres.\n",
    "    \"\"\"\n",
    "    n = len(x)\n",
    "    c_id = [np.random.choice(np.arange(n))]  # index of the first cluster centre\n",
    "    # Squared distance from every point to its nearest chosen centre so far;\n",
    "    # kept as a running minimum instead of being recomputed for all centres each round.\n",
    "    min_dis = np.sum(np.square(x - x[c_id[0]]), axis=1, dtype=np.float64)\n",
    "    while len(c_id) < k:\n",
    "        total = np.sum(min_dis)\n",
    "        if total > 0:\n",
    "            ind = np.random.choice(np.arange(n), p=min_dis / total)\n",
    "        else:\n",
    "            # Every point coincides with a chosen centre, so the weights cannot be\n",
    "            # normalised; fall back to a uniform draw instead of passing NaNs.\n",
    "            ind = np.random.choice(np.arange(n))\n",
    "        c_id.append(ind)\n",
    "        new_dis = np.sum(np.square(x - x[ind]), axis=1, dtype=np.float64)\n",
    "        min_dis = np.minimum(min_dis, new_dis)\n",
    "    return c_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 3])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.array([[1,2,3],[2,3,4]])\n",
    "np.min(a, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "axis = np.array(axis, dtype=np.float32)\n",
    "c_id = initialCentroid(axis, 3)\n",
    "c_axis = axis[c_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# axis holds the coordinates of every point; center holds the coordinates of the k cluster centres\n",
    "def diff(x, new_x):\n",
    "    \"\"\"Return the sum over rows of the Euclidean distance between `x` and `new_x`.\"\"\"\n",
    "    return np.sum(np.sqrt(np.sum(np.square(x - new_x), axis=1)))\n",
    "\n",
    "\n",
    "def _assign(axis, center, k):\n",
    "    \"\"\"Return, for every point, the index of the nearest of the first k centres.\"\"\"\n",
    "    dis = np.stack([np.sum(np.square(axis - c), axis=1) for c in center[:k]])\n",
    "    return np.argmin(dis, axis=0)\n",
    "\n",
    "\n",
    "def _update(axis, belong, center, k):\n",
    "    \"\"\"Return the mean of each cluster; a cluster without members keeps its previous centre.\"\"\"\n",
    "    new_center = np.array(center, dtype=np.float32)\n",
    "    for i in range(k):\n",
    "        members = axis[belong == i]\n",
    "        if len(members) > 0:\n",
    "            new_center[i] = members.mean(axis=0)\n",
    "    return new_center\n",
    "\n",
    "\n",
    "def kmeans(axis, center, k, epsilon):\n",
    "    \"\"\"Cluster the rows of `axis` into k groups by alternating assignment and centre updates.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    axis : np.ndarray\n",
    "        Point coordinates, one row per point.\n",
    "    center : np.ndarray\n",
    "        Initial centre coordinates, one row per cluster.\n",
    "    k : int\n",
    "        Number of clusters.\n",
    "    epsilon : float\n",
    "        Iteration stops once `diff(new_center, center)` is no longer greater than this.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    belong : np.ndarray\n",
    "        Cluster index (0 .. k-1) of every point.\n",
    "    new_center : np.ndarray\n",
    "        float32 centre coordinates after the final update.\n",
    "    \"\"\"\n",
    "    center = np.asarray(center, dtype=np.float32)\n",
    "    while True:\n",
    "        belong = _assign(axis, center, k)\n",
    "        new_center = _update(axis, belong, center, k)\n",
    "        # Written as `not (... > epsilon)` so a NaN movement also terminates the loop.\n",
    "        if not diff(new_center, center) > epsilon:\n",
    "            return belong, new_center\n",
    "        center = new_center"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "belong, new_center = kmeans(axis, c_axis, 3, 1e-10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "48.985415041446686\n"
     ]
    }
   ],
   "source": [
    "# Total squared distance from every point to the centre of its cluster.\n",
    "err = 0\n",
    "for i in range(len(axis)):\n",
    "    err += np.sum(np.square(axis[i] - new_center[belong[i]]))\n",
    "print(err)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel the cluster ids in place: 2 -> 1, 1 -> 2, anything else -> 3.\n",
    "# NOTE(review): this permutation is hard-coded, while the cluster ids depend on the random\n",
    "# initialisation -- confirm it still matches `label` after re-running the cells above.\n",
    "# Running this cell a second time permutes the labels again.\n",
    "for i in range(len(belong)):\n",
    "    if belong[i] == 2:\n",
    "        belong[i] = 1\n",
    "    elif belong[i] == 1:\n",
    "        belong[i] = 2\n",
    "    else:\n",
    "        belong[i] = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fraction of points whose (relabelled) cluster id equals the label column.\n",
    "acc = np.sum(belong == label) / len(belong)\n",
    "print(acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1,\n",
       "       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2], dtype=int64)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "belong"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import 
matplotlib.pyplot as plt\n",
    "def draw(x, y, axis, center, belong):\n",
    "    \"\"\"Scatter-plot the points and the three cluster centres on feature columns x and y.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x, y : int\n",
    "        Column indices of the features used for the horizontal and vertical axes.\n",
    "    axis : array-like\n",
    "        Point coordinates, one row per point.\n",
    "    center : array-like\n",
    "        Centre coordinates; rows 2, 1 and 0 are drawn as centres 1, 2 and 3.\n",
    "    belong : sequence of int\n",
    "        Label of every point; label b is drawn with color[b - 1], so labels 1, 2, 3\n",
    "        give red, blue and green.\n",
    "    \"\"\"\n",
    "    color = ['r', 'b', 'g']\n",
    "    points = np.asarray(axis)\n",
    "    # One vectorised call instead of one scatter artist per point.\n",
    "    point_colors = [color[b - 1] for b in belong]\n",
    "    plt.scatter(points[:, x], points[:, y], c=point_colors, marker='o')\n",
    "    # Centre rows are listed in reverse so that each star matches the colour of its label.\n",
    "    stars = ((2, 'r', 'cluster center1'), (1, 'b', 'cluster center2'), (0, 'g', 'cluster center3'))\n",
    "    for row, star_color, name in stars:\n",
    "        plt.scatter(center[row][x], center[row][y], c=star_color, s=140, marker='*', label=name)\n",
    "    plt.legend(fontsize=8, loc='upper right')\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       ""
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "draw(0, 1, axis, new_center, belong)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the label assigned to every point, numbered from 1, without the DataFrame index.\n",
    "ans_df = pd.DataFrame({'Order' : np.arange(1, len(axis) + 1),\n",
    "                       'label' : belong})\n",
    "ans_df.to_csv('label.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The with-block closes the file on exit, so no explicit close() is needed.\n",
    "with open('Measurement.txt', 'w') as f:\n",
    "    f.write('Accuracy: ' + str(acc) + '\\n')\n",
    "    f.write('Total square distance: ' + str(err))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "73e8fb81fc9d21637ba62ed4f9412d39843bbeeb61edb8163afd2f9314d52c65"
  },
  "kernelspec": {
   "display_name": "Python 3.7.6 64-bit (system)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}